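# pip install beautifulsoup4 lxml   (lxml provides the 'lxml' parser used below)
# Reference: the official Beautiful Soup documentation (a Chinese translation is available)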
from bs4 import BeautifulSoup


html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Step 1: parse the raw markup into a navigable tree.
# The parser is named explicitly ('lxml'); when no parser is given, bs4
# picks one itself and emits a warning, although parsing still works.
soup = BeautifulSoup(html_doc, "lxml")

# Step 2: pretty-print the tree. The sample markup above leaves <body> and
# <html> unclosed; the formatted output shows them completed.
result = soup.prettify()
print(result)

# Step 3: fetch the <head> element.
result_head = soup.find("head")
print(result_head)

# Fetch a <p> element -- only the first match in the document is returned.
result_p = soup.find("p")
print(result_p)

# Fetch the first <a> element once and reuse it for the lookups below.
result_a = soup.find("a")

# Text content of that <a> element.
result_a_str = result_a.string
print(result_a_str)

# Value of the 'href' attribute of that <a> element.
result_a_attr = result_a["href"]
print(result_a_attr)




